# Figures_1_2_A3.R
# John G. Bullock (john@johnbullock.org)
# 2016 August 02

# This file creates Figures 1, 2, and A3 in 
#   Gertler, Aaron L., and John G. Bullock.  2016.  "Reference Rot: An 
#   Emerging Threat to Transparency in Political Science."  PS: Political 
#   Science and Politics.
#
# The particular figure created (1, 2, or A3) depends on the settings of the 
# PERSONAL_AND_USGOV and COMPARE_2014_TO_2016 arguments.  (See lines 17-18.)  

library(grid)
library(lattice)
library(plyr)  # for aaply()
source('URL_dataCoding.R')
if (interactive()) {
  PERSONAL_AND_USGOV   <- TRUE
  COMPARE_2014_TO_2016 <- FALSE
} else {
  clArgs <- commandArgs(trailingOnly = TRUE)
  if (length(clArgs) <= 0) { 
    writeLines(paste0(
      "clArgs has length ", length(clArgs), 
      ".  This length is less than 1, so the script is stopping."))
    stop()
  }
  PERSONAL_AND_USGOV   <- as.logical(clArgs[1])
  COMPARE_2014_TO_2016 <- as.logical(clArgs[2])
}



##############################################################################
# PRELIMINARIES FOR PRINTING THE FIGURE
##############################################################################
if (PERSONAL_AND_USGOV) { 
  filenameStem     <- 'Figure_A3'  # 'linePlotBrokenURLsByYear_breakdownByType'
  PDFtitle         <- 'Broken URLs in the APSR, 2000-2013: Personal Sites vs. Institutional Sites and U.S. Government Sites vs. Other Institutional Sites'
  lineTypes        <-  c('32', 'solid')
} else if (COMPARE_2014_TO_2016) { 
  filenameStem     <- 'Figure_2'   # 'linePlotBrokenURLsByYear_2014vs2016'
  PDFtitle         <- 'Broken URLs in the APSR, 2000-2013: Broken-Link Rates in 2014 and 2016'
  lineTypes        <-  c('32', 'solid')
} else {
  filenameStem     <- 'Figure_1'  # 'linePlotBrokenURLsByYear'
  PDFtitle         <- 'Broken URLs in the APSR, 2000-2013'  
  lineTypes        <-  c('solid', 32)
}
PS_width             <- 9.5
PS_height            <- 12.0
postscriptBackground <- 'transparent'
panelWidth           <- if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) list(2.75, "inches") else list(5.50, "inches")
panelHeight          <- if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) list(3.10, "inches") else list(2.80, "inches") 
panelLayout          <- if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) c(2, 1) else c(1, 1)
xBetween             <- 2 
plotLineColor        <- c('blue', 'blue')  # c('black', 'gray')
plotLineWidth        <- 1
panelBorderCol       <- 'black'
panelBorderWidth     <-  .30
baseCexSize          <- 1.00
xLabSize             <-  .80 * baseCexSize
yLabSize             <-  .80 * baseCexSize
axisTickSize         <-  .40
if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) {
  xAxisLabels <- seq(2000, 2012, by = 3)
  xAxisLabels <- strsplit(paste0(xAxisLabels, collapse = '   '), ' ')[[1]]
} else {
  xAxisLabels <- seq(2000, 2012, by = 2)
  xAxisLabels <- strsplit(paste0(xAxisLabels, collapse = '  '), ' ')[[1]]
}
xAxis <- list(
  draw        = TRUE, 
  labels      = c(xAxisLabels, ''), 
  at          = 2000:2013,
  alternating = c(1, 1),
  tck         = c(axisTickSize, 0), 
  col         = panelBorderCol, 
  cex         = xLabSize,
  axs         = 'i')  
yAxisLabels <- seq(20, 80, by = 20)
yAxisLabels <- strsplit(paste0(yAxisLabels, collapse = '  '), ' ')[[1]]
yAxisLabels <- gsub('0', '0%', yAxisLabels)
if (PERSONAL_AND_USGOV) { 
  yAxisLabels <- c('', yAxisLabels, '')
  yAxisAt     <- seq(.1, .9, by = .1)
} else {
  yAxisAt     <- seq(.2, .8, by = .1)
}
yAxis <- list(
  draw   = TRUE,
  alternating = c(1, 1),
  limits = if (PERSONAL_AND_USGOV) c(.00, 1.025) else c(.1, .9),
  labels = NULL,  # better handled in the panel function for this figure 
  at     = yAxisAt, 
  tck    = c(axisTickSize, 0),
  col    = panelBorderCol,
  cex    = yLabSize,
  rot    = 90,
  axs    = 'i')

                    

##############################################################################
# CREATE THE DATA FRAME 
##############################################################################
dataToPlotPersonal <- data.frame(
  year          = rep(2000:2013, 2),
  percentBroken = NA,
  URL_type      = rep(c('personal', 'institutional'), each = 14))
resultsByYearPersonal                 <- with(subset(URL_data, URL_type=='Personal site'), table(result2016, year))
resultsByYearPercentagesPersonal      <- aaply(resultsByYearPersonal, 2, .fun = function (x) { x/sum(x) })         # percentage of broken links per year 
resultsByYearInstitutional            <- with(subset(URL_data, URL_type=='Institutional site'), table(result2016, year))
resultsByYearPercentagesInstitutional <- aaply(resultsByYearInstitutional, 2, .fun = function (x) { x/sum(x) })  # percentage of broken replication links per year 
dataToPlotPersonal$percentBroken[dataToPlotPersonal$URL_type == 'personal']      <- 1 - resultsByYearPercentagesPersonal[, 'Information found'] 
dataToPlotPersonal$percentBroken[dataToPlotPersonal$URL_type == 'institutional'] <- 1 - resultsByYearPercentagesInstitutional[, 'Information found']  

dataToPlotUS_gov <- data.frame(
  year          = rep(2000:2013, 2),
  percentBroken = NA,
  URL_type      = rep(c('US_fedGov', 'otherInst'), each = 14))
resultsByYearUS                   <- with(subset(URL_data, US_fedGov), table(result2016, year))
resultsByYearPercentagesUS        <- aaply(resultsByYearUS, 2, .fun = function (x) { x/sum(x) })         # percentage of broken links per year 
resultsByYearOtherInst            <- with(subset(URL_data, URL_type=='Institutional site' & !US_fedGov), table(result2016, year))
resultsByYearPercentagesOtherInst <- aaply(resultsByYearOtherInst, 2, .fun = function (x) { x/sum(x) })  # percentage of broken replication links per year 
dataToPlotUS_gov$percentBroken[dataToPlotUS_gov$URL_type == 'US_fedGov'] <- 1 - resultsByYearPercentagesUS[, 'Information found'] 
dataToPlotUS_gov$percentBroken[dataToPlotUS_gov$URL_type == 'otherInst'] <- 1 - resultsByYearPercentagesOtherInst[, 'Information found']    

resultsByYear2014            <- with(URL_data, table(result2014, year))
resultsByYear2014Percentages <- aaply(resultsByYear2014, 2, .fun = function (x) { x/sum(x) })                                  # percentage of broken links per year 
resultsByYear2016            <- with(URL_data, table(result2016, year))
resultsByYear2016Percentages <- aaply(resultsByYear2016, 2, .fun = function (x) { x/sum(x) })                                  # percentage of broken links per year 
resultsByYear2014ReplicationOnly            <- with(URL_data[URL_data$referent=='Reproducibility',], table(result2014, year))
resultsByYear2014ReplicationOnlyPercentages <- aaply(resultsByYear2014ReplicationOnly, 2, .fun = function (x) { x/sum(x) })    # percentage of broken replication links per year 
resultsByYear2016ReplicationOnly            <- with(URL_data[URL_data$referent=='Reproducibility',], table(result2016, year))
resultsByYear2016ReplicationOnlyPercentages <- aaply(resultsByYear2016ReplicationOnly, 2, .fun = function (x) { x/sum(x) })    # percentage of broken replication links per year 

if (PERSONAL_AND_USGOV) {
  dataToPlot <- rbind(dataToPlotPersonal, dataToPlotUS_gov)
  dataToPlot$panel <- rep(1:2, each = nrow(dataToPlotPersonal))
} else if (COMPARE_2014_TO_2016) {
  dataToPlot <- data.frame(
    year              = rep(2000:2013, 4),
    percentBroken     = NA,
    URL_checkedInYear = rep(c(2014, 2016), each = 28),
    panel             = rep(qw("overall reproducibility"), each = 14))
  dataToPlot$percentBroken[dataToPlot$URL_checkedInYear==2014 & dataToPlot$panel=='overall']         <- 1 - resultsByYear2014Percentages[, 'Information found'] 
  dataToPlot$percentBroken[dataToPlot$URL_checkedInYear==2016 & dataToPlot$panel=='overall']         <- 1 - resultsByYear2016Percentages[, 'Information found']  
  dataToPlot$percentBroken[dataToPlot$URL_checkedInYear==2014 & dataToPlot$panel=='reproducibility'] <- 1 - resultsByYear2014ReplicationOnlyPercentages[, 'Information found']
  dataToPlot$percentBroken[dataToPlot$URL_checkedInYear==2016 & dataToPlot$panel=='reproducibility'] <- 1 - resultsByYear2016ReplicationOnlyPercentages[, 'Information found']  
} else {
  dataToPlot <- data.frame(
  year                     = rep(2000:2013, 2),
  percentBroken            = NA,
  URL_type                 = rep(c('overall', 'replication'), each = 14))
  dataToPlot$percentBroken[dataToPlot$URL_type == 'overall']     <- 1 - resultsByYear2016Percentages[, 'Information found'] 
  dataToPlot$percentBroken[dataToPlot$URL_type == 'replication'] <- 1 - resultsByYear2016ReplicationOnlyPercentages[, 'Information found']
  dataToPlot$panel <- 1
}



##############################################################################
# DATA ANALYSIS FOR THE PAPER 
##############################################################################
# Increased breakage in the 18-month period, by year in which link was 
# published.  [2016 05 14]
tmp <- cbind(resultsByYear2014Percentages[, 1], resultsByYear2016Percentages[, 1])
tmp <- cbind(tmp, tmp[, 1] - tmp[, 2])
mean(tmp[, 3])

tmpRep <- cbind(resultsByYear2014ReplicationOnlyPercentages[, 1], resultsByYear2016ReplicationOnlyPercentages[, 1])
tmpRep <- cbind(tmpRep, tmpRep[, 1] - tmpRep[, 2])
mean(tmpRep[, 3])

# Fully 16% of all reproducibility links published in the 2006-11 period broke
# between our 2014 examination and our 2016 examination.  But breakage rates
# were much lower for links published in the prior and successive periods. 
# [2016 05 14]

# 2000-05
denom       <- sum(apply(resultsByYear2014ReplicationOnly, 2, sum)[1:6])
newlyBroken <- resultsByYear2014ReplicationOnly[1, ] - resultsByYear2016ReplicationOnly[1, ] 
num         <- sum(newlyBroken[1:6]) 
num/denom    

# 2006-11
denom       <- sum(apply(resultsByYear2014ReplicationOnly, 2, sum)[7:12])
newlyBroken <- resultsByYear2014ReplicationOnly[1, ] - resultsByYear2016ReplicationOnly[1, ] 
num         <- sum(newlyBroken[7:12]) 
num/denom    

# 2012-13
denom       <- sum(apply(resultsByYear2014ReplicationOnly, 2, sum)[13:14])
newlyBroken <- resultsByYear2014ReplicationOnly[1, ] - resultsByYear2016ReplicationOnly[1, ] 
num         <- sum(newlyBroken[13:14]) 
num/denom    


# COMPARING U.S. GOVERNMENT SITES TO OTHER INSTITUTIONAL SITES
with(URL_data, table(URL_type, US_fedGov))  # 97 links to U.S. government sites
apply(resultsByYearUS,        1, sum)       # number of links by result type
apply(resultsByYearOtherInst, 1, sum)       # number of links by result type
resultsByYearPercentagesUS                  # seven years in which 100% of govt. links are broken
cbind(
  resultsByYearPercentagesUS[, 1],
  apply(resultsByYearUS, 2, sum))           # number of links per year



##############################################################################
# PANEL FUNCTION
##############################################################################
myPanel <- function (...) {

  # HORIZONTAL LINE AT 50%
  lsegments(2000, .5, 2013, .5, col = 'grey')  
  
  # PLOT THE DATA
  panel.xyplot(...)  
  
  # TICK MARKS AT TOP AND RIGHT
  if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) {
    topTickMarks <- c(2*axisTickSize, axisTickSize, axisTickSize)
  } 
  else {
    topTickMarks <- c(2*axisTickSize, axisTickSize)
  }
  panel.axis(
    side        = "top",
    at          = xAxis$at,
    draw.labels = FALSE,
    half        = FALSE,
    tck         = topTickMarks)
  if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) {
    panel.axis(
      side        = "right",
      at          = yAxis$at,
      draw.labels = FALSE,
      half        = FALSE,
      outside     = FALSE,
      tck         = if (panel.number()==1) c(2*axisTickSize, axisTickSize) else axisTickSize)
    if (panel.number() == 2) {
      panel.axis(
        side        = "left",
        at          = yAxis$at,
        draw.labels = FALSE,
        half        = FALSE,
        outside     = FALSE,
        tck         = c(2*axisTickSize, axisTickSize))      
    }
  }
  else {
    panel.axis(
      side        = "right",
      at          = yAxis$at,
      draw.labels = FALSE,
      half        = FALSE,
      outside     = FALSE,
      tck         = axisTickSize)
  }  

  
  # X-AXIS LABELS ABOVE EACH PANEL
  if (COMPARE_2014_TO_2016) {
    grid.text(
      label = if (panel.number()==1) 'All URLs' else 'Reproducibility URLs',
      x     = .5,
      y     = 1.04,
      gp    = gpar(cex = .95*baseCexSize))      
  }
  
  
  # X-AXIS LABELS BELOW THE PANELS
  if (panelLayout[1]==2) {
    grid.text(
      label = 'Year in which Articles Were Published',
      x =  .50,
      y = -.15,  # .17 too big
      gp = gpar(cex = xLabSize))
  }

  
  # Y-AXIS LABELS ON LEFT AND RIGHT
  if (PERSONAL_AND_USGOV || COMPARE_2014_TO_2016) {
    if (panel.number()==1) {
      grid.text(
        label = yAxisLabels,
        x     = unit(-.0450, "npc"),
        y     = unit(yAxis$at, "native"),
        rot   = 90,
        gp    = gpar(cex = yLabSize))      
    }
    else if (panel.number()==2 || COMPARE_2014_TO_2016) {
      grid.text(
        label = yAxisLabels,
        x     = unit(1.0450, "npc"),
        y     = unit(yAxis$at, "native"),
        rot   = 270,
        gp    = gpar(cex = yLabSize))      
    }
  }
  else {
    grid.text(
      label = yAxisLabels,
      x     = unit(-.0260, "npc"),
      y     = unit(yAxis$at, "native"),
      rot   = 90,
      gp    = gpar(cex = yLabSize))
    grid.text(
      label = yAxisLabels,
      x     = unit(1.0260, "npc"),
      y     = unit(yAxis$at, "native"),
      rot   = 270,
      gp    = gpar(cex = yLabSize))
  }  
  
  # LEGEND
  if (PERSONAL_AND_USGOV) {
    legendText <- if (panel.number()==1) c('Personal\U00ADsite URLs', 'Institutional\U00ADsite URLs') else c('U.S. government URLs', 'Other institutional URLs') 
    line_x0    <- unit(2000.75, "native")
    ySolid     <- unit(.15, "native")
  } else if (COMPARE_2014_TO_2016) {
      legendText <- c('Broken\U00ADLink Rates in 2016', 'Broken\U00ADLink Rates in 2014')
      line_x0    <- unit(2000.75, "native")
      ySolid     <- unit(.30, "native")
  } else {
    legendText <- c('All URLs', 'Reproducibility URLs')
    line_x0    <- unit(2001.25, "native")
    ySolid     <- unit(.30, "native")
  }
  line_x1 <- convertWidth(line_x0 + unit(.5000, "native"), "native")
  text_x0 <- convertWidth(line_x1 + unit(.2500, "native"), "native")
  yDashed <- convertHeight(ySolid - unit(.0625, "native"), "native")
  rect_x0 <- convertWidth(line_x0 - unit(.2500, "native"), "native")
  if (PERSONAL_AND_USGOV && panel.number()==1) {
    rect_x1 <- convertWidth(line_x1 + unit(7.00, "native"), "native")
  } else if (PERSONAL_AND_USGOV && panel.number()==2) {
    rect_x1 <- convertWidth(line_x1 + unit(7.50, "native"), "native")
  } else if (COMPARE_2014_TO_2016 && panel.number()==1) {
    rect_x1 <- convertWidth(line_x1 + unit(8.35, "native"), "native")
  } else if (COMPARE_2014_TO_2016 && panel.number()==2) {
    rect_x1 <- convertWidth(line_x1 + unit(8.35, "native"), "native")
  } else { 
    rect_x1 <- convertWidth(line_x1 + unit(3.50, "native"), "native")
  }
  rect_y0 <- convertHeight(yDashed - unit(.05, "native"), "native")
  rect_y1 <- convertHeight(ySolid  + unit(.05, "native"), "native")
  grid.segments(
    x0 = line_x0,
    x1 = line_x1,
    y0 = ySolid,
    y1 = ySolid,
    gp = gpar(lty='solid', lwd=1, col='blue'))
  grid.segments(
    x0 = line_x0,
    x1 = line_x1,
    y0 = yDashed,
    y1 = yDashed,
    gp = gpar(lty='32', lwd=1, col='blue'))
  grid.text(
    label = legendText[1],
    x     = text_x0,
    y     = ySolid,
    just  = c("left", "center"),
    gp    = gpar(cex = yLabSize))
  grid.text(
    label = legendText[2],
    x     = text_x0, 
    y     = yDashed, 
    just  = c("left", "center"),
    gp    = gpar(cex = yLabSize))
  grid.rect(
    x      = rect_x0,
    y      = rect_y0,
    width  = convertWidth(rect_x1 - rect_x0, "native"),
    height = convertHeight(rect_y1 - rect_y0, "native"),
    just   = c("left", "bottom"),
    gp     = gpar(fill = 'transparent', lwd = 0.5))
}



##############################################################################
# DRAW THE FIGURE
##############################################################################
setwd('figureOutput/')
pdf(
  file   = paste0(filenameStem, '.pdf'), 
  width  = PS_width, 
  height = PS_height, 
  paper  = "special", 
  title  = PDFtitle,
  bg     = postscriptBackground)
trellis.par.set(
  axis.components = list(
    left  = list(
      pad1 = 0.35,    # distance between axis ticks and tick labels
      pad2 = 3.00),   # distance between axis labels and big y-axis label
    bottom = list(
      pad1 = 1.00,    # distance between axis ticks and tick labels
      pad2 = 1.80)),  # distance between axis labels and big x-axis label
  axis.line = list(
    alpha = 1, 
    col   = panelBorderCol,    # to eliminate lattice panel border, set col = "white"
    lty   = 1, 
    lwd   = panelBorderWidth),
  par.xlab.text = list(
    cex = if (COMPARE_2014_TO_2016) .875*baseCexSize else .925 * baseCexSize),
  par.ylab.text = list(
    cex = if (COMPARE_2014_TO_2016) .875*baseCexSize else .925 * baseCexSize),
  superpose.line = list(        # governs lines when the "groups" argument is used
    alpha = c(1,1), 
    col   = c('blue', 'blue'),   
    lty   = lineTypes, 
    lwd   = c(1, plotLineWidth)))
brokenURL_percentagesPlot <- xyplot(
  percentBroken ~ year | panel,
  groups   = if (COMPARE_2014_TO_2016) URL_checkedInYear else URL_type,
  layout   = panelLayout,
  between  = list(x = xBetween, y = 0),
  data     = dataToPlot, 
  type     = 'l',  
  strip    = FALSE,
  panel    = myPanel,
  scales   = list(x = xAxis, y = yAxis),
  xlab     = if (panelLayout[1]==1) 'Year in which Articles Were Published' else NULL,
  ylab     = 'Percentage of URLs that Are Broken',
  par.settings = trellis.par.set("clip", list(panel="off", strip="off"))
)
print(
  x            = brokenURL_percentagesPlot, 
  panel.width  = panelWidth, 
  panel.height = panelHeight)
dev.off()